suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(plotly))
library(gapminder)
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(grid)
library(forcats)

0. Introduction

This assignment aims to practice factor management skills, as well as writing datasets and figures to file.

1. Factor Management

In this section, I will use gapminder as the dataset to explore these factor features.

## 1.1 Drop factors/levels

Drop Oceania

The task description:

> Filter the Gapminder data to remove observations associated with the continent of Oceania. Additionally, remove unused factor levels. Provide concrete information on the data before and after removing these rows and Oceania; address the number of rows and the levels of the affected factors.

Before we drop Oceania, let us see the levels in continent.

# Tabulate how many rows fall under each continent level before any filtering.
# NOTE(review): the first column lists continents, so the "Countries" header is
# a misnomer; it is kept here to match the rendered table.
gapminder$continent %>%
  fct_count() %>% # one row per factor level with its frequency
  knitr::kable(col.names = c("Countries", "Counts"),
               format = "markdown",
               booktabs = TRUE, # was misspelled `booktaps`; only relevant for LaTeX output
               padding = 0)
Countries Counts
Africa 624
Americas 300
Asia 396
Europe 360
Oceania 24

Then I will filter out Oceania from continent and compare those two datasets.

# Remove every Oceania observation. filter() only drops rows, so the continent
# factor still carries "Oceania" as a level afterwards.
gap_no_Oce <- gapminder %>%
  filter(continent != "Oceania")

# Count rows per continent level after filtering; unused levels are still
# listed by fct_count().
gap_no_Oce$continent %>%
  fct_count() %>%
  knitr::kable(col.names = c("Factors", "Counts"),
               format = "markdown",
               booktabs = TRUE, # was misspelled `booktaps`; only relevant for LaTeX output
               padding = 0)
Factors Counts
Africa 624
Americas 300
Asia 396
Europe 360
Oceania 0

We can see that, apart from Oceania, the other levels are untouched. Now I am going to remove the unused levels (i.e. Oceania in gap_no_Oce).

# Drop factor levels that no longer occur in the filtered data.
gap_drop_Oce <- gap_no_Oce %>%
  droplevels()

# Count rows per continent level once the unused levels have been dropped.
# NOTE(review): the first column lists continents, so the "Countries" header is
# a misnomer; it is kept here to match the rendered table.
gap_drop_Oce$continent %>%
  fct_count() %>%
  knitr::kable(col.names = c("Countries", "Counts"),
               format = "markdown",
               booktabs = TRUE, # was misspelled `booktaps`; only relevant for LaTeX output
               padding = 0)
Countries Counts
Africa 624
Americas 300
Asia 396
Europe 360

Finally, I will compare the number of rows of those 3 datasets.

# Row counts of the original, the filtered, and the level-dropped datasets,
# in that order.
vapply(list(gapminder, gap_no_Oce, gap_drop_Oce), nrow, integer(1))
## [1] 1704 1680 1680

Reorder the level of country and continent

The task description:

> Use the forcats package to change the order of the factor levels, based on a principled summary of one of the quantitative variables. Consider experimenting with a summary statistic beyond the most basic choice of the median.

I will reorder the continents by the IQR of gdpPercap and compare two plots to see the change in the order of the continents.

# Plot after reordering: continent levels are sorted by the IQR of gdpPercap
# (fct_reorder sorts ascending by default). Each panel layers a violin, a
# notched boxplot and jittered raw points.
plot1 <- gapminder %>%
  mutate(continent = fct_reorder(continent, gdpPercap, IQR)) %>%
  ggplot(aes(continent, gdpPercap)) +
  geom_violin(fill = "cornflowerblue") +
  geom_boxplot(fill = "lightblue", width = 0.5, alpha = 0.5, notch = TRUE) +
  geom_jitter(col = "#0353A4", alpha = 0.1) +
  theme_bw() +
  labs(title = "gdpPercap by continent, reordered by IQR")

# Plot before reordering: the continent levels keep their original order.
# The previous group_by(continent) was removed because grouping a data frame
# has no effect on how ggplot draws it.
plot2 <- gapminder %>%
  ggplot(aes(continent, gdpPercap)) +
  geom_violin(fill = "cornflowerblue") +
  geom_boxplot(fill = "lightblue", width = 0.5, alpha = 0.5, notch = TRUE) +
  geom_jitter(col = "#0353A4", alpha = 0.1) +
  theme_bw() +
  labs(title = "gdpPercap by continent, original order")

# Show both plots side by side for comparison.
grid.arrange(plot1, plot2, ncol = 2)

But if we use arrange here, the order of factors won’t change.

# Mean life expectancy per continent, with rows listed in reverse level order.
# arrange() only sorts the rows of the summary; the levels of the continent
# factor are not modified.
knitr::kable(
  gapminder %>%
    group_by(continent) %>%
    summarise(meanLifeExp = mean(lifeExp)) %>%
    arrange(desc(continent))
)
continent meanLifeExp
Oceania 74.32621
Europe 71.90369
Asia 60.06490
Americas 64.65874
Africa 48.86533
# Boxplot of the yearly mean life expectancy per continent, built from a
# summary whose rows were arranged by descending continent.
gapminder %>%
  group_by(continent, year) %>%
  summarise(meanLifeExp = mean(lifeExp)) %>%
  arrange(desc(continent)) %>%
  ggplot(aes(x = continent, y = meanLifeExp)) +
  geom_boxplot(width = 0.5, alpha = 0.5, fill = "lightblue") +
  labs(title = "meanLifeExp by continent after arrange") +
  theme_bw()

2. File I/O

Task description:

> Experiment with one or more of write_csv()/read_csv() (and/or TSV friends), saveRDS()/readRDS(), dput()/dget(). Create something new, probably by filtering or grouped-summarization of Singer or Gapminder. I highly recommend you fiddle with the factor levels, i.e. make them non-alphabetical (see previous section). Explore whether this survives the round trip of writing to file then reading back in.

I am going to create a new factor variable, ecoLevel, by comparing each country's gdpPercap with the worldwide mean of gdpPercap. I will keep only the data from 2007 to simplify the dataset.

# Treat year as a factor and count the observations recorded for each year.
fct_count(as.factor(gapminder$year))
## # A tibble: 12 x 2
##    f         n
##    <fct> <int>
##  1 1952    142
##  2 1957    142
##  3 1962    142
##  4 1967    142
##  5 1972    142
##  6 1977    142
##  7 1982    142
##  8 1987    142
##  9 1992    142
## 10 1997    142
## 11 2002    142
## 12 2007    142
# Keep only the observations recorded in 2007.
gap_2007 <- filter(gapminder, year == 2007)

# Add ecoLevel: "high" when a country's gdpPercap exceeds the mean gdpPercap
# of the 2007 rows, "low" otherwise, stored as a factor.
gap_07eco <- mutate(
  gap_2007,
  ecoLevel = factor(ifelse(gdpPercap > mean(gdpPercap), "high", "low"))
)

Then write the data to file and read it back in.

# Write the 2007 data (with ecoLevel) to CSV, overwriting any existing file
# and including a header row.
write_csv(
  gap_07eco,
  "gap_with_ecolevel.csv",
  append = FALSE,
  col_names = TRUE
)
# Read the file back and inspect the column types after the round trip.
str(read_csv("gap_with_ecolevel.csv"))
## Parsed with column specification:
## cols(
##   country = col_character(),
##   continent = col_character(),
##   year = col_integer(),
##   lifeExp = col_double(),
##   pop = col_integer(),
##   gdpPercap = col_double(),
##   ecoLevel = col_character()
## )
## Classes 'tbl_df', 'tbl' and 'data.frame':    142 obs. of  7 variables:
##  $ country  : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ continent: chr  "Asia" "Europe" "Africa" "Africa" ...
##  $ year     : int  2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
##  $ lifeExp  : num  43.8 76.4 72.3 42.7 75.3 ...
##  $ pop      : int  31889923 3600523 33333216 12420476 40301927 20434176 8199783 708573 150448339 10392226 ...
##  $ gdpPercap: num  975 5937 6223 4797 12779 ...
##  $ ecoLevel : chr  "low" "low" "low" "low" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 7
##   .. ..$ country  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ continent: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ year     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ lifeExp  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ pop      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ gdpPercap: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ecoLevel : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

3. Visualization Design

Task description:

Remake at least one figure or create a new one, in light of something you learned in the recent class meetings about visualization design and color. Maybe juxtapose your first attempt and what you obtained after some time spent working on it. Reflect on the differences. If using Gapminder, you can use the country or continent color scheme that ships with Gapminder.

I am going to explore the theme function in the former boxplot.

# Baseline figure: default-styled boxplot of the yearly mean life expectancy
# per continent.
p1 <- gapminder %>%
  group_by(continent, year) %>%
  summarise(meanLifeExp = mean(lifeExp)) %>%
  arrange(desc(continent)) %>%
  ggplot(aes(x = continent, y = meanLifeExp)) +
  geom_boxplot()


# Custom theme: brown bold-italic titles, dark-blue bold axis text, a white
# panel with a dark-blue border, grey horizontal grid lines (solid for major,
# dashed for minor), no minor vertical grid lines, and the legend on top.
mytheme <- theme(
  plot.title = element_text(face = "bold.italic", size = 14, color = "brown"),
  axis.title = element_text(face = "bold.italic", size = 10, color = "brown"),
  axis.text = element_text(face = "bold", size = 9, color = "darkblue"),
  panel.background = element_rect(fill = "white", color = "darkblue"),
  panel.grid.major.y = element_line(color = "grey", linetype = 1),
  panel.grid.minor.y = element_line(color = "grey", linetype = 2),
  panel.grid.minor.x = element_blank(),
  legend.position = "top"
)

# Restyled figure: the baseline boxplot with a title and the custom theme.
p2 <- p1 +
  labs(title = "meanLifeExp by continent after arrange") +
  mytheme
# Display the restyled plot.
p2

comparing to the former one:

# The earlier theme_bw() version of the same boxplot, redrawn for comparison.
gapminder %>%
  group_by(continent, year) %>%
  summarise(meanLifeExp = mean(lifeExp)) %>%
  arrange(desc(continent)) %>%
  ggplot(aes(x = continent, y = meanLifeExp)) +
  geom_boxplot(width = 0.5, alpha = 0.5, fill = "lightblue") +
  labs(title = "meanLifeExp by continent after arrange") +
  theme_bw()

Then, make a new graph by converting this visual (or another, if you’d like) to a plotly graph. What are some things that plotly makes possible, that are not possible with a regular ggplot2 graph?

# Convert the themed boxplot into an interactive plotly figure.
plotly::ggplotly(p2)

we can see that in plotly plots, there are some interactions.

We can add more dynamic interactions with plotly. The static ggplot2 version of the plot below is cluttered, but the plotly version makes it readable.

# Static ggplot2 version: life expectancy against GDP per capita (log10 x
# axis with dollar labels), coloured by continent, points sized by population,
# plus a linear fit without a confidence band. `frame` and `ids` are extra
# aesthetics supplied for the later plotly conversion; ggplot2 reports `ids`
# as unknown (see the warning below).
p3 <- ggplot(
  gapminder,
  aes(x = gdpPercap, y = lifeExp, color = continent, frame = year)
) +
  geom_point(aes(size = pop, ids = country)) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_log10(labels = dollar_format()) +
  theme_bw()
## Warning: Ignoring unknown aesthetics: ids
p3

# Interactive plotly version of p3, with highlighting triggered on hover.
plotly::highlight(plotly::ggplotly(p3), on = "plotly_hover")

writing figures to file

We can save the last plot into png/pdf/jpeg file by using ggsave

# Write the static ggplot `p3` to disk in three formats. The plot is passed
# explicitly because ggsave() otherwise saves whatever last_plot() returns,
# which depends on which ggplot happened to be created or shown most recently.
# NOTE(review): the files are named "plotly.*" but hold the ggplot2 figure, and
# the ".png.pdf" / ".png.jpeg" double extensions look accidental; the file
# names are kept unchanged in case they are referenced elsewhere.

# PNG at "retina" resolution
ggsave("plotly.png", plot = p3, width = 8, height = 4, dpi = "retina")
# PDF
ggsave("plotly.png.pdf", plot = p3, width = 8, height = 4)
# JPEG
ggsave("plotly.png.jpeg", plot = p3, width = 8, height = 4)